# download_jios_issue.py
# JIOS (Journal of Information and Organizational Sciences) Downloader
# Automates downloading PDFs from JIOS (OJS 3.x platform)
# - Parses issue Table of Contents
# - Visits each article page to extract direct PDF links
# - Skips "Cover" and "Editor's Section" items by section header
# - Creates folders dynamically using Vol/Issue/Year extracted from <title>
# - Ensures Windows-safe filenames and logs all downloads into a CSV file

import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---------- Helpers ----------
def sanitize_filename(name: str) -> str:
    """Return *name* with characters Windows rejects in file names removed.

    - Reserved punctuation (``\\ / * ? : " < > |``) is dropped.
    - Runs of ASCII control characters (e.g. a newline inside a scraped
      title) become a single space, so adjacent words do not fuse.
    - Surrounding whitespace and trailing dots/spaces are trimmed, since
      Windows does not allow a name to end in either.
    - If nothing usable is left, ``"untitled"`` is returned so callers never
      build a file name from an empty stem.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
    cleaned = re.sub(r'[\x00-\x1f]+', " ", cleaned)
    cleaned = cleaned.strip().rstrip(". ")
    return cleaned or "untitled"

def extract_volume_issue_year(title_text: str) -> str:
    """Build a folder name of the form ``JIOS_Vol<V>_Issue<I>_<YYYY>``.

    The volume is the number after ``Vol`` / ``Vol.``, the issue is the number
    after ``No`` / ``No.`` (both case-insensitive, matched only at a word
    boundary), and the year is a four-digit number in parentheses.

    Any part that cannot be found is replaced by a placeholder
    (``Unknown`` / ``UnknownYear``) that does not repeat the surrounding
    prefix, so the result stays readable.
    """
    # The period is optional so both "Vol. 47 No. 2" and "Vol 47, No 2" match;
    # \b keeps "No" from matching inside a longer word.
    vol_match = re.search(r'\bVol\.?\s*(\d+)', title_text, re.I)
    issue_match = re.search(r'\bNo\.?\s*(\d+)', title_text, re.I)
    year_match = re.search(r'\((\d{4})\)', title_text)

    vol = vol_match.group(1) if vol_match else "Unknown"
    issue = issue_match.group(1) if issue_match else "Unknown"
    year = year_match.group(1) if year_match else "UnknownYear"
    return f"JIOS_Vol{vol}_Issue{issue}_{year}"

# ---------- Input ----------
issue_url = input("Enter JIOS issue URL: ").strip()

# ---------- Fetch Issue Page ----------
print(f"[INFO] Fetching issue page: {issue_url}")
# requests applies no timeout by default; without one a stalled server would
# hang the script indefinitely.
resp = requests.get(issue_url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# ---------- Folder Naming ----------
# The download folder is named from the page <title>; a generic name is used
# when the page has no <title> element at all.
title_tag = soup.find("title")
if title_tag:
    folder_name = extract_volume_issue_year(title_tag.get_text())
else:
    folder_name = "JIOS_Issue"
os.makedirs(folder_name, exist_ok=True)

# ---------- Find Sections ----------
sections = soup.find_all("div", class_="section")
print(f"[INFO] Found {len(sections)} sections")

log_path = os.path.join(folder_name, f"{folder_name}_log.csv")

# requests applies no timeout by default; bound every request so a single
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30  # seconds

count = 0
downloaded_pdfs = set()  # PDF URLs already handled in this run
used_filenames = set()   # lower-cased file names already claimed in this run

# ---------- Download Loop ----------
# The context manager guarantees the CSV log is flushed and closed even when
# the loop is interrupted (Ctrl+C, unexpected error).
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    csv_writer = csv.writer(log_file)
    csv_writer.writerow(["Title", "Article URL", "PDF URL", "Status"])

    for section in sections:
        # Whole sections are skipped when their <h2> header mentions
        # "Cover" or "Editor" (case-insensitive).
        section_header = section.find("h2")
        if section_header:
            sec_title = section_header.get_text(strip=True)
            if re.search(r'Cover|Editor', sec_title, re.I):
                print(f"[SKIP SECTION] {sec_title}")
                continue

        # Two markup variants are tried for the article entries.
        articles = section.find_all("li", class_="article-summary")
        if not articles:
            articles = section.find_all("div", class_="obj_article_summary")

        for art in articles:
            heading = art.find("h3", class_="title")
            # Entries without a linked title cannot be followed; an anchor
            # lacking href is skipped too instead of raising KeyError.
            if not heading or not heading.a or not heading.a.get("href"):
                continue
            title = heading.a.get_text(strip=True)
            article_url = urljoin(issue_url, heading.a["href"])

            pdf_link = ""

            try:
                # ---------- Fetch Article Page ----------
                art_resp = requests.get(article_url, timeout=REQUEST_TIMEOUT)
                art_resp.raise_for_status()
                art_soup = BeautifulSoup(art_resp.text, "html.parser")

                # The first galley link whose label mentions "pdf" is used.
                # NOTE(review): the link is fetched as-is; if the site serves
                # an HTML viewer page at this URL instead of the PDF bytes,
                # the saved file will not be a valid PDF - confirm against a
                # live issue.
                for a in art_soup.find_all("a", class_="obj_galley_link"):
                    href = a.get("href")
                    if href and "pdf" in a.get_text(strip=True).lower():
                        pdf_link = urljoin(article_url, href)
                        break

                if not pdf_link:
                    print(f"[SKIP] No PDF for: {title}")
                    csv_writer.writerow([title, article_url, "", "No PDF"])
                    continue

                if pdf_link in downloaded_pdfs:
                    print(f"[SKIP DUPLICATE] {title}")
                    csv_writer.writerow([title, article_url, pdf_link, "Skipped (Duplicate)"])
                    continue
                downloaded_pdfs.add(pdf_link)

                clean_title = sanitize_filename(title)
                # Distinct articles can share a title (e.g. "Book Review");
                # add a numeric suffix so a later download does not silently
                # overwrite an earlier one. Comparison is case-insensitive
                # because Windows file names are.
                file_name = f"{clean_title}.pdf"
                suffix = 2
                while file_name.lower() in used_filenames:
                    file_name = f"{clean_title} ({suffix}).pdf"
                    suffix += 1
                used_filenames.add(file_name.lower())
                pdf_path = os.path.join(folder_name, file_name)

                print(f"[{count+1}] Downloading: {clean_title}")
                r = requests.get(pdf_link, timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
                with open(pdf_path, "wb") as f:
                    f.write(r.content)

                csv_writer.writerow([title, article_url, pdf_link, "OK"])
                count += 1

            except Exception as e:
                # Per-article boundary: record the failure in the log and
                # carry on with the remaining articles.
                print(f"[ERROR] Failed: {title} - {e}")
                csv_writer.writerow([title, article_url, pdf_link, f"Error: {e}"])

print(f"\nDone! {count} PDFs saved in {folder_name}")
print(f"Log file created: {log_path}")
